In [1]:
#importing python libraries
import numpy as np 
import pandas as pd 
In [2]:
df = pd.read_csv('insurance.csv')
df.head()
Out[2]:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
In [4]:
df.describe()
Out[4]:
age bmi children charges
count 1338.000000 1338.000000 1338.000000 1338.000000
mean 39.207025 30.663397 1.094918 13270.422265
std 14.049960 6.098187 1.205493 12110.011237
min 18.000000 15.960000 0.000000 1121.873900
25% 27.000000 26.296250 0.000000 4740.287150
50% 39.000000 30.400000 1.000000 9382.033000
75% 51.000000 34.693750 2.000000 16639.912515
max 64.000000 53.130000 5.000000 63770.428010
In [5]:
df.size
Out[5]:
9366

EDA (Exploration Data Analysis)

In [6]:
s_s = df.groupby(by=['sex','smoker']).mean()
s_s
Out[6]:
age bmi children charges
sex smoker
female no 39.691042 30.539525 1.087751 8762.297300
yes 38.608696 29.608261 1.008696 30678.996276
male no 39.061896 30.770580 1.092843 8087.204731
yes 38.446541 31.504182 1.188679 33042.005975
In [7]:
s_s = pd.pivot_table(data=df, columns='sex', values="age", index='children')
s_s
Out[7]:
sex female male
children
0 38.346021 38.543860
1 39.506329 39.403614
2 40.512605 38.396694
3 42.181818 40.975000
4 42.000000 36.642857
5 37.000000 34.500000
In [8]:
s_s = pd.pivot_table(data=df, columns='region', values="charges", index='smoker', aggfunc= 'max')
s_s
Out[8]:
region northeast northwest southeast southwest
smoker
no 32108.66282 33471.97189 36580.28216 36910.60803
yes 58571.07448 60021.39897 63770.42801 52590.82939
In [9]:
df[(df['smoker'] == 'yes') & (df["region"] == 'northeast') & (df['charges'] == 58571.07448)].head()
Out[9]:
age sex bmi children smoker region charges
577 31 female 38.095 1 yes northeast 58571.07448
In [10]:
df[(df['smoker'] == 'yes') & (df["region"] == 'northeast') & (df['charges'] > 5000) & (df['sex'] == 'male')].head()
Out[10]:
age sex bmi children smoker region charges
38 35 male 36.670 1 yes northeast 39774.27630
92 59 male 29.830 3 yes northeast 30184.93670
98 56 male 19.950 0 yes northeast 22412.64850
123 44 male 31.350 1 yes northeast 39556.49450
157 18 male 25.175 0 yes northeast 15518.18025
In [11]:
df[(df['smoker'] == 'yes') & (df["region"] == 'northeast') & (df['charges'] > 20000) & (df['sex'] == 'male')].head()
Out[11]:
age sex bmi children smoker region charges
38 35 male 36.670 1 yes northeast 39774.27630
92 59 male 29.830 3 yes northeast 30184.93670
98 56 male 19.950 0 yes northeast 22412.64850
123 44 male 31.350 1 yes northeast 39556.49450
185 36 male 41.895 3 yes northeast 43753.33705

EDA- Visualizations

In [12]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [13]:
sns.countplot(data=df, x = 'sex')
Out[13]:
<AxesSubplot:xlabel='sex', ylabel='count'>
In [14]:
sns.jointplot(data=df, x='age', y='charges')
Out[14]:
<seaborn.axisgrid.JointGrid at 0x27d16712af0>
In [15]:
male = df[(df['charges'] >= 1600) & (df['sex'] == 'male')]
male.head()
Out[15]:
age sex bmi children smoker region charges
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
8 37 male 29.830 2 no northeast 6406.41070
In [16]:
female = df[(df['charges'] >= 1600) & (df['sex'] == 'female')]
female.head()
Out[16]:
age sex bmi children smoker region charges
0 19 female 27.90 0 yes southwest 16884.92400
5 31 female 25.74 0 no southeast 3756.62160
6 46 female 33.44 1 no southeast 8240.58960
7 37 female 27.74 3 no northwest 7281.50560
9 60 female 25.84 0 no northwest 28923.13692
In [17]:
fig, ax = plt.subplots(figsize=(14, 5))
df[(df['charges'] >= 1600) & (df['sex'] == 'male')]['charges'].plot()
Out[17]:
<AxesSubplot:>
In [18]:
#Barchart
fig, ax = plt.subplots(figsize=(140, 5))
df[(df['charges'] >= 1600) & (df['sex'] == 'male')]['charges'].plot(kind="bar")
Out[18]:
<AxesSubplot:>
In [19]:
pd.DataFrame(df[(df['charges'] >= 1600) & (df['sex'] == 'female')]['charges']).plot()
Out[19]:
<AxesSubplot:>
In [20]:
#Barchart
fig, ax = plt.subplots(figsize=(140, 5))
df[(df['charges'] >= 1600) & (df['sex'] == 'female')]['charges'].plot(kind='bar')
Out[20]:
<AxesSubplot:>
In [21]:
#Partitioning into quartiles
pd.qcut(male['charges'], q=100)
Out[21]:
1         (1710.039, 1771.385]
2          (4447.07, 4522.661]
3       (21230.311, 22030.743]
4         (3864.992, 4033.738]
8         (6397.702, 6599.067]
                 ...          
1324      (4239.035, 4407.564]
1325    (12960.454, 13143.685]
1327      (9331.882, 9504.273]
1329    (10220.693, 10423.281]
1333    (10586.342, 10739.625]
Name: charges, Length: 652, dtype: category
Categories (100, interval[float64, right]): [(1621.339, 1637.687] < (1637.687, 1694.95] < (1694.95, 1710.039] < (1710.039, 1771.385] ... (43916.891, 45907.758] < (45907.758, 47265.568] < (47265.568, 48674.519] < (48674.519, 62592.873]]
In [22]:
fig, ax = plt.subplots(figsize=(10, 8))
df['age'].value_counts().plot(kind='bar')
Out[22]:
<AxesSubplot:>

From Samples

In [23]:
# Distribution of Age

import plotly.express as px
import plotly.graph_objects as go

fig = px.histogram(df,
                   nbins = 100,
                   x = 'age',
                   color='sex',
                   )

fig.update_layout(title = 'Distribution of Age',
                  height = 700)

fig.show()

Feature Engineering

In [24]:
df.head()
Out[24]:
age sex bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
In [25]:
df['sex'] = df['sex'].astype('category')
df['sex'] = df['sex'].cat.codes
In [26]:
df['smoker'] = df['smoker'].astype('category')
df['smoker'] = df['smoker'].cat.codes
df.head()
Out[26]:
age sex bmi children smoker region charges
0 19 0 27.900 0 1 southwest 16884.92400
1 18 1 33.770 1 0 southeast 1725.55230
2 28 1 33.000 3 0 southeast 4449.46200
3 33 1 22.705 0 0 northwest 21984.47061
4 32 1 28.880 0 0 northwest 3866.85520

Training and Testing model

Linear Regression model

In [27]:
from sklearn.model_selection import train_test_split
In [28]:
X= df.drop(['region', 'charges', 'sex'], axis=1)
y= df['charges']
In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
In [30]:
from sklearn.linear_model import LinearRegression
In [31]:
lr = LinearRegression()
In [32]:
lr.fit(X_train, y_train)
Out[32]:
LinearRegression()
In [33]:
lr.coef_
Out[33]:
array([  262.31060687,   331.74780875,   379.02452728, 23699.63589053])
In [34]:
X.columns
Out[34]:
Index(['age', 'bmi', 'children', 'smoker'], dtype='object')
In [35]:
cdf = pd.DataFrame(lr.coef_, X.columns, columns=['Coeff'])
cdf
Out[35]:
Coeff
age 262.310607
bmi 331.747809
children 379.024527
smoker 23699.635891
In [36]:
predictions = lr.predict(X_test)
predictions
Out[36]:
array([ 8.48006203e+03,  6.96853565e+03,  3.69728391e+04,  9.31049341e+03,
        2.68392121e+04,  1.10958507e+04,  1.94880246e+01,  1.69290641e+04,
        6.46901186e+02,  1.10744225e+04,  2.83669588e+04,  9.22723399e+03,
        5.36873437e+03,  3.85188017e+04,  4.05796772e+04,  3.72708277e+04,
        1.53365498e+04,  3.61254658e+04,  9.31712306e+03,  3.14643164e+04,
        3.94794502e+03,  1.04186683e+04,  2.63531950e+03,  6.54233946e+03,
        1.11476369e+04,  1.25725757e+04,  1.49492691e+04,  5.87087793e+03,
        9.47742387e+03,  2.28675446e+03,  9.29501776e+03,  1.30758632e+04,
        4.64285342e+03,  3.19104515e+03,  4.71781915e+03,  1.25754834e+04,
        2.21197400e+03,  9.14392468e+03,  3.32901712e+04,  3.27656497e+04,
        3.93329608e+03,  4.11643626e+03,  1.45112746e+04,  1.14889096e+04,
        8.84056782e+03,  1.25446997e+04,  4.98953519e+03,  3.32259509e+03,
        3.56153188e+04,  9.19176083e+03,  1.61342983e+04,  2.41033317e+03,
        1.21787276e+04,  9.54787007e+02,  1.36777556e+04,  1.20922861e+04,
        3.88190200e+03,  3.19434300e+04,  1.37611522e+04,  1.24993614e+04,
        1.46414331e+04,  1.04986174e+04,  1.66804011e+04,  7.63009812e+03,
        1.14491497e+04,  3.89388552e+03,  2.68164600e+04,  1.11016519e+04,
        1.97237619e+03,  6.44115638e+03,  1.02043685e+04,  1.11060758e+04,
        1.12143482e+04,  9.28317144e+03,  1.21132974e+04,  6.93536087e+03,
        6.59574690e+03,  1.07650099e+04,  6.61670832e+03,  9.03325416e+03,
        3.86965118e+03,  3.63851859e+04,  6.60435776e+03,  3.02223056e+04,
        3.48590781e+04,  3.49393248e+04,  7.04595469e+03,  1.28664024e+04,
        9.78337124e+03,  1.47477901e+04,  1.72255240e+04,  3.56806814e+04,
        3.26404550e+04,  5.62713062e+03,  3.21611792e+04,  9.76134796e+03,
        2.96363485e+04,  3.76017441e+03,  2.81198067e+04,  5.28966721e+03,
        5.23792382e+03,  2.13769778e+03,  1.16771862e+04,  1.53688480e+04,
        1.14147260e+04,  4.34442290e+03,  1.00473406e+04,  3.21019118e+04,
       -5.89499042e+02,  3.31377970e+04,  3.53103858e+03,  1.02016104e+04,
        1.39062946e+04,  3.11528957e+04,  1.10097264e+04,  4.13919893e+03,
        1.27912496e+04,  3.21609921e+04,  8.24765860e+03,  3.05507311e+03,
        7.89448438e+03,  1.04248436e+04,  1.46969661e+04,  5.75821557e+03,
        3.57955713e+03,  1.00466511e+04,  1.10481571e+04,  1.06945660e+04,
        1.47201333e+04,  7.50297920e+03,  5.55768811e+03,  9.26981420e+03,
        9.40168130e+03,  1.20712267e+04,  8.67772637e+03,  1.57294786e+04,
        7.91498613e+03,  3.22354002e+04,  3.57291643e+04,  3.11826105e+04,
        5.92850627e+03,  1.21953525e+04,  6.13686774e+03,  1.44133217e+04,
        2.66323835e+03,  3.33699332e+04,  6.08871975e+03,  5.35177463e+03,
        1.41518431e+04,  7.24448851e+03,  3.85336804e+04,  2.92817692e+03,
        5.93598036e+03,  3.11955632e+04,  1.13704620e+04,  7.98820048e+03,
        1.44819643e+04,  1.00173408e+04,  2.72339919e+04,  3.30104037e+04,
        1.40962228e+04,  1.47005797e+03,  1.36049564e+04,  1.73056733e+03,
        5.58465547e+03,  1.16757627e+04,  4.02983811e+04,  3.65548868e+04,
        3.35869055e+04,  4.17600836e+03,  7.86475717e+03,  8.95722656e+03,
        1.19471010e+04,  4.68081734e+03,  2.32748355e+03,  3.24032553e+04,
        2.54392363e+04,  1.77833147e+04,  2.63114088e+04,  1.02079034e+04,
        3.69941729e+04, -9.67828744e+02,  6.76621225e+03,  8.04377090e+03,
        3.77966768e+03,  4.92386983e+03,  5.38532176e+03,  4.62501709e+03,
        1.53087183e+04,  1.11645361e+04,  7.04434582e+03,  1.96330035e+03,
        1.02703214e+03,  3.21980438e+04,  1.67113096e+04,  1.22567258e+04,
        1.06034943e+03,  1.20699831e+04,  1.01781909e+03,  9.17455883e+03,
        1.76111415e+03,  3.39826921e+04,  1.08635399e+04,  2.35333917e+03,
        2.57952683e+04,  2.65590651e+04,  9.44327455e+03,  1.53378488e+03,
        1.34768682e+04,  1.40342870e+03,  1.09289453e+04,  1.08523262e+04,
        1.63377387e+04,  2.70630920e+04,  7.10050069e+03,  4.70131908e+03,
        5.98006969e+03,  1.34122255e+04,  1.14426947e+04,  8.47527289e+03,
        4.92483905e+03,  1.22186247e+04,  1.38450762e+04,  3.59296472e+04,
        3.98954007e+03,  2.91203880e+04, -6.15809029e+02,  2.69835158e+03,
        1.12855368e+04,  1.56931841e+04,  5.29971408e+03,  6.93217348e+03,
        4.09233733e+03,  3.17678656e+04,  7.28903221e+03,  1.26444111e+04,
        5.38712301e+03,  9.89593035e+03,  3.64336741e+04,  4.54024143e+03,
        9.38751707e+03,  3.14080243e+04,  5.56437294e+03,  4.69127932e+03,
        9.01937196e+02,  4.69210604e+03,  4.79209537e+03,  6.78930450e+03,
        1.86107688e+04, -1.65086219e+03,  2.63223186e+03,  1.09064801e+04,
        3.47007504e+03,  9.89274296e+03,  3.48708465e+03,  5.19530616e+03,
        1.28924327e+04,  6.01661182e+03,  7.88880623e+03,  6.95163640e+03,
        8.71362707e+03,  1.05637929e+04,  2.80187555e+04,  3.94325193e+04,
        1.17938877e+04,  7.21034451e+03,  4.11588569e+04,  1.26689251e+04,
        7.00899034e+03,  7.96298981e+03,  9.45368666e+03,  1.10771057e+04,
        9.99075639e+03,  1.78494344e+04,  1.39858968e+03,  2.31416529e+04,
        1.21156955e+04,  3.26962125e+04,  4.68068014e+03,  1.33587307e+04,
        1.03470575e+04,  1.74018047e+04,  1.00867886e+04,  1.13657531e+04,
        3.25475282e+04,  2.82247730e+03,  1.38399949e+04,  3.95311795e+04,
        4.98579549e+03,  5.94256544e+03,  2.87726556e+03,  1.18401026e+04,
        2.50628878e+04,  1.34918075e+04,  9.33017376e+03,  9.70461593e+03,
        1.36214137e+04,  1.33399150e+03,  2.64264397e+03,  3.08810496e+04,
        3.04023136e+04,  1.36257503e+04,  3.45640063e+03,  2.52642420e+04,
        1.36535694e+04,  3.09222383e+04,  2.96066219e+03,  3.92934151e+04,
        1.12194295e+04,  4.98718162e+03,  7.07364370e+03,  2.62928673e+03,
        2.58287603e+04,  1.48435695e+04,  8.51807987e+02,  1.30859634e+04,
        1.29410456e+04,  1.48824598e+04,  3.51065955e+04,  1.42334385e+04,
        3.19581111e+04,  1.02730716e+04,  1.84324892e+04,  6.23150851e+03,
        9.20764628e+03,  9.64947305e+03,  1.54261645e+04,  9.50497037e+03,
        7.83352795e+03,  1.53928918e+04,  1.24445731e+04,  1.43466871e+04,
        7.74519787e+03,  2.62851932e+04,  9.45115845e+03,  1.70923356e+03,
        4.46366504e+03,  1.44935381e+04,  3.59430045e+04,  9.97813322e+03,
        1.27902804e+04,  4.91050548e+03,  4.77799874e+03,  4.15196461e+03,
        2.27846076e+03,  8.92045459e+03,  7.29162272e+03,  2.68816931e+03,
        1.33778142e+04,  8.83525671e+03,  6.20008510e+03,  1.07113389e+03,
        9.88224352e+03,  5.13784716e+03,  3.28981243e+04,  2.86503253e+04,
        3.72700830e+04,  5.99135838e+03,  8.84272369e+03,  8.61183649e+03,
        3.85969875e+03,  3.13075682e+04,  6.64208653e+03,  2.87059759e+04,
        3.59301069e+04,  7.31438539e+03,  1.34232292e+04,  9.62869871e+03,
        8.29934680e+03,  1.22396841e+04,  2.98995286e+04,  1.73943929e+04,
        1.16048591e+04,  3.81970194e+03, -7.73756276e+02,  1.18050392e+04,
        3.12635217e+04,  1.32105934e+04,  1.16042445e+04,  7.71764605e+03,
        3.07543766e+03,  7.50734612e+03,  7.72104375e+03,  1.09874786e+04,
        3.36274974e+04,  3.96458998e+04,  1.24009862e+04,  8.27911946e+03,
        1.62997125e+04,  1.53512540e+04,  9.80378566e+03,  9.59505894e+03,
        8.69326253e+03,  3.00683248e+03,  1.04714629e+04,  4.18508420e+03,
        1.11403446e+04,  1.55258706e+04,  6.86237635e+03,  1.64813993e+03,
        1.47497607e+04,  4.89973037e+02,  1.39044933e+04,  8.78881202e+03,
        1.34508860e+04,  3.58726854e+04,  3.36895299e+04,  3.56004722e+04,
        6.15566619e+03,  5.54387119e+03,  1.66690696e+04,  7.72741673e+03,
        3.78720253e+04,  5.28003905e+03,  7.99907758e+03,  1.04983305e+04,
        3.04572320e+04,  4.69820461e+03,  3.26370188e+03,  1.61014032e+04,
        3.17081781e+03,  6.30163522e+03,  9.75199240e+03, -5.82634248e+02,
        2.96851468e+04,  7.94428932e+03,  1.02645979e+04,  5.89515152e+03,
        7.89508657e+03,  1.18152268e+04,  2.92915606e+04,  9.75540252e+03,
        1.10809595e+04,  6.00365907e+03,  3.92512716e+03,  1.35923962e+03,
        8.09445242e+03,  1.13177049e+04,  1.06488981e+04,  8.94717438e+03,
        5.73546352e+03,  4.40113719e+03])
In [37]:
y_test
Out[37]:
764      9095.06825
887      5272.17580
890     29330.98315
1293     9301.89355
259     33750.29180
           ...     
644     18806.14547
602     11070.53500
731     10065.41300
321     24671.66334
479      1824.28540
Name: charges, Length: 442, dtype: float64
In [38]:
plt.scatter(y_test, predictions)
Out[38]:
<matplotlib.collections.PathCollection at 0x27d2684f1c0>
In [39]:
sns.distplot((y_test-predictions))
C:\Users\user\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).

Out[39]:
<AxesSubplot:xlabel='charges', ylabel='Density'>

Model Evaluation

In [40]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
In [41]:
print(mean_absolute_error(y_test, predictions))
print(mean_squared_error(y_test, predictions))
#print(sqrt(mean_squared_error(y_test, predictions)))
4210.867850540483
35299792.10029327
In [42]:
lr.score(X_test, y_test)
Out[42]:
0.7591192126340146

Insurance Prediction

In [43]:
X.head()
Out[43]:
age bmi children smoker
0 19 27.900 0 1
1 18 33.770 1 0
2 28 33.000 3 0
3 33 22.705 0 0
4 32 28.880 0 0
In [44]:
pred = lr.predict([[19, 27.9,0,1]])
print('value charge to frederick insurance = ', pred)
value charge to frederick insurance =  [25505.58586529]
C:\Users\user\anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

Logistic Regression (Predicting smoker or non-smoker)

In [45]:
df.head()
Out[45]:
age sex bmi children smoker region charges
0 19 0 27.900 0 1 southwest 16884.92400
1 18 1 33.770 1 0 southeast 1725.55230
2 28 1 33.000 3 0 southeast 4449.46200
3 33 1 22.705 0 0 northwest 21984.47061
4 32 1 28.880 0 0 northwest 3866.85520
In [46]:
from sklearn.model_selection import train_test_split
In [47]:
X= df.drop(['smoker', 'region'], axis=1)
y= df['smoker']
In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
In [49]:
from sklearn.linear_model import LogisticRegression
In [50]:
logmodel = LogisticRegression()
In [51]:
logmodel.fit(X_train, y_train)
Out[51]:
LogisticRegression()
In [52]:
predictions = logmodel.predict(X_test)
predictions
Out[52]:
array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0], dtype=int8)
In [53]:
y_test
Out[53]:
764     0
887     0
890     1
1293    0
259     1
       ..
644     0
602     0
731     0
321     0
479     0
Name: smoker, Length: 442, dtype: int8

Model Evaluation

In [54]:
from sklearn.metrics import classification_report
In [55]:
print(classification_report(y_test, predictions))
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       356
           1       0.81      0.81      0.81        86

    accuracy                           0.93       442
   macro avg       0.88      0.88      0.88       442
weighted avg       0.93      0.93      0.93       442

In [56]:
logmodel.score(X_test, y_test)
Out[56]:
0.9276018099547512
In [57]:
from sklearn.metrics import confusion_matrix
In [58]:
print(confusion_matrix(y_test, predictions))
[[340  16]
 [ 16  70]]

My Predications

In [59]:
logmodel.score(X_test, y_test)
Out[59]:
0.9276018099547512
In [60]:
X.head()
Out[60]:
age sex bmi children charges
0 19 0 27.900 0 16884.92400
1 18 1 33.770 1 1725.55230
2 28 1 33.000 3 4449.46200
3 33 1 22.705 0 21984.47061
4 32 1 28.880 0 3866.85520
In [61]:
y
Out[61]:
0       1
1       0
2       0
3       0
4       0
       ..
1333    0
1334    0
1335    0
1336    0
1337    1
Name: smoker, Length: 1338, dtype: int8
In [62]:
logmodel.predict([[19,0,27.900,0,99999.92400]])
C:\Users\user\anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning:

X does not have valid feature names, but LogisticRegression was fitted with feature names

Out[62]:
array([1], dtype=int8)

Decision Tree

In [63]:
df.head()
Out[63]:
age sex bmi children smoker region charges
0 19 0 27.900 0 1 southwest 16884.92400
1 18 1 33.770 1 0 southeast 1725.55230
2 28 1 33.000 3 0 southeast 4449.46200
3 33 1 22.705 0 0 northwest 21984.47061
4 32 1 28.880 0 0 northwest 3866.85520
In [64]:
from sklearn.model_selection import train_test_split
In [65]:
X= df.drop(['smoker', 'region'], axis=1)
y= df['smoker']
In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
In [67]:
from sklearn.tree import DecisionTreeClassifier
In [68]:
dr = DecisionTreeClassifier()
In [69]:
dr.fit(X_train, y_train)
Out[69]:
DecisionTreeClassifier()
In [70]:
predict = dr.predict(X_test)
predict
Out[70]:
array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0], dtype=int8)
In [71]:
y_test
Out[71]:
764     0
887     0
890     1
1293    0
259     1
       ..
644     0
602     0
731     0
321     0
479     0
Name: smoker, Length: 442, dtype: int8
In [72]:
dr.score(X_test, y_test)
Out[72]:
0.9705882352941176
In [73]:
from sklearn.metrics import classification_report
In [74]:
print(classification_report(y_test, predictions))
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       356
           1       0.81      0.81      0.81        86

    accuracy                           0.93       442
   macro avg       0.88      0.88      0.88       442
weighted avg       0.93      0.93      0.93       442

In [75]:
from sklearn.metrics import confusion_matrix
In [76]:
print(confusion_matrix(y_test, predictions))
[[340  16]
 [ 16  70]]

My Predictions

In [77]:
X.head()
Out[77]:
age sex bmi children charges
0 19 0 27.900 0 16884.92400
1 18 1 33.770 1 1725.55230
2 28 1 33.000 3 4449.46200
3 33 1 22.705 0 21984.47061
4 32 1 28.880 0 3866.85520
In [78]:
dr.predict([[52,0,3,0, 500000]])
C:\Users\user\anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning:

X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names

Out[78]:
array([1], dtype=int8)

Random Forest

In [79]:
from sklearn.model_selection import train_test_split
In [80]:
df.head()
Out[80]:
age sex bmi children smoker region charges
0 19 0 27.900 0 1 southwest 16884.92400
1 18 1 33.770 1 0 southeast 1725.55230
2 28 1 33.000 3 0 southeast 4449.46200
3 33 1 22.705 0 0 northwest 21984.47061
4 32 1 28.880 0 0 northwest 3866.85520
In [81]:
X = df.drop(['region', 'smoker'], axis=1)
y = df['smoker']
In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
In [83]:
from sklearn.ensemble import RandomForestClassifier
In [84]:
rf = RandomForestClassifier(n_estimators=20)
In [85]:
rf.fit(X_train, y_train)
Out[85]:
RandomForestClassifier(n_estimators=20)
In [86]:
rf.score(X_test, y_test)
Out[86]:
0.9660633484162896
In [87]:
predictions = rf.predict(X_test)
predictions
pd.DataFrame(predictions).head()
Out[87]:
0
0 0
1 0
2 1
3 0
4 1
In [88]:
y_test 
Out[88]:
764     0
887     0
890     1
1293    0
259     1
       ..
644     0
602     0
731     0
321     0
479     0
Name: smoker, Length: 442, dtype: int8
In [89]:
pd.DataFrame(y_test).head()
Out[89]:
smoker
764 0
887 0
890 1
1293 0
259 1
In [90]:
from sklearn.metrics import classification_report
In [91]:
print(classification_report(y_test, predictions))
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       356
           1       0.89      0.94      0.92        86

    accuracy                           0.97       442
   macro avg       0.94      0.96      0.95       442
weighted avg       0.97      0.97      0.97       442

In [92]:
from sklearn.metrics import confusion_matrix
In [93]:
print(confusion_matrix(y_test, predictions))
[[346  10]
 [  5  81]]
In [94]:
cm = confusion_matrix(y_test, predictions)
cm
Out[94]:
array([[346,  10],
       [  5,  81]], dtype=int64)
In [95]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(10,8))
sns.heatmap(cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Truth')
Out[95]:
Text(95.72222222222221, 0.5, 'Truth')

K Fold Cross Valuation

In [96]:
from sklearn.model_selection import cross_val_score
In [97]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
In [98]:
cross_val_score(LogisticRegression(), X,y)
Out[98]:
array([0.92164179, 0.93283582, 0.93656716, 0.92883895, 0.94756554])
In [99]:
cross_val_score(SVC(), X,y)
Out[99]:
array([0.90671642, 0.89925373, 0.94776119, 0.91011236, 0.92883895])
In [100]:
cross_val_score(RandomForestClassifier(), X,y)
Out[100]:
array([0.95895522, 0.94776119, 0.97761194, 0.95505618, 0.98127341])
In [101]:
cross_val_score(RandomForestClassifier(n_estimators=40), X,y)
Out[101]:
array([0.94776119, 0.94402985, 0.98134328, 0.95505618, 0.98876404])
In [ ]: